notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np

Read in the data..



In [2]:

    
data = pd.read_csv('/Users/Frankie/Documents/Dissertation/Data/pancreatic/24hProbeExpressionValues.csv')
data[:5]









    Out[2]:






  
    
      
      AP09_.HG.U133_Plus_2..CEL
      AP100_.HG.U133_Plus_2..CEL
      AP103_.HG.U133_Plus_2..CEL
      AP105_.HG.U133_Plus_2..CEL
      AP11_.HG.U133_Plus_2..CEL
      AP117_.HG.U133_Plus_2..CEL
      AP124_.HG.U133_Plus_2..CEL
      AP128_.HG.U133_Plus_2..CEL
      AP129_.HG.U133_Plus_2..CEL
      AP135_.HG.U133_Plus_2..CEL
      ...
      AP72_.HG.U133_Plus_2..CEL
      AP77_.HG.U133_Plus_2..CEL
      AP79_.HG.U133_Plus_2..CEL
      AP81_.HG.U133_Plus_2..CEL
      AP88_.HG.U133_Plus_2..CEL
      AP91_.HG.U133_Plus_2..CEL
      AP97_.HG.U133_Plus_2..CEL
      AP99_.HG.U133_Plus_2..CEL
      KA8_.HG.U133_Plus_2..CEL
      UHA_AP_08_.HG.U133_Plus_2..CEL
    
  
  
    
      Probe1
      7.007313
      6.582350
      6.802145
      7.359145
      5.598688
      8.608102
      7.343843
      6.950850
      6.124941
      6.240495
      ...
      7.500518
      7.386547
      6.204605
      5.263889
      7.218517
      6.656728
      7.174989
      8.552347
      6.870060
      7.835673
    
    
      Probe2
      9.665835
      9.487955
      9.770240
      9.814704
      9.552441
      10.322249
      8.812083
      9.707691
      9.780077
      9.258532
      ...
      9.712581
      9.913510
      8.688643
      9.116479
      8.887095
      8.921837
      8.935894
      9.901182
      9.573303
      9.862596
    
    
      Probe3
      12.420633
      12.463445
      12.469181
      12.797041
      11.524895
      12.886851
      11.770089
      11.792732
      12.018923
      12.121481
      ...
      12.736876
      12.718448
      12.693598
      12.149493
      11.770906
      12.622852
      12.168525
      11.998917
      12.125662
      12.235938
    
    
      Probe4
      6.794176
      6.462573
      7.023898
      7.071085
      7.224542
      6.927613
      6.522564
      6.887145
      7.086610
      7.568517
      ...
      6.764534
      7.036994
      6.537610
      7.368730
      6.338258
      6.756069
      7.169891
      6.459975
      6.905888
      7.470530
    
    
      Probe5
      3.109143
      2.973142
      3.288617
      3.302783
      3.154333
      3.459464
      3.277148
      3.358144
      3.358739
      3.083314
      ...
      3.094350
      3.520533
      3.011519
      3.242574
      3.053249
      3.085127
      3.251309
      3.722801
      3.216627
      3.037672
    
  

5 rows × 92 columns

The columns are the instances and rows the features so we need to transpose the dataset.



In [3]:

    
data = data.T

Read in the labels...



In [4]:

    
label = pd.read_csv('/Users/Frankie/Documents/Dissertation/Data/pancreatic/24hTargets.csv')



In [5]:

    
label[:5]









    Out[5]:






  
    
      
      z
      X
      FileName
      FileName2
      Investigator
      SampleAP
      AltafOAC
      OAC
      RAC
      DBC
      ...
      Notes
      PS.Checked
      age
      sex
      admission
      time
      onset
      oac
      rac.mild
      dbc.mild
    
  
  
    
      0
      AP09_(HG-U133_Plus_2)
      AP09_(HG-U133_Plus_2)
      AP09_.HG.U133_Plus_2..CEL
      AP09
      KA
      9
      Severe
      Severe
      Severe
      Critical
      ...
      NaN
      Y
      25
      M
      03/08/2010
      11:34
      48
      Severe
      Severe
      Severe
    
    
      1
      AP100_(HG-U133_Plus_2)
      AP100_(HG-U133_Plus_2)
      AP100_.HG.U133_Plus_2..CEL
      AP100
      KA
      100
      Mild
      Mild
      Mild
      Mild
      ...
      NaN
      Y
      54
      M
      18/02/2011
      12:00
      23
      Mild
      Mild
      Mild
    
    
      2
      AP103_(HG-U133_Plus_2)
      AP103_(HG-U133_Plus_2)
      AP103_.HG.U133_Plus_2..CEL
      AP103
      KA
      103
      Severe
      Severe
      Moderate
      Moderate
      ...
      NaN
      Y
      38
      M
      26/02/2011
      19:41
      2
      Severe
      Severe
      Severe
    
    
      3
      AP105_(HG-U133_Plus_2)
      AP105_(HG-U133_Plus_2)
      AP105_.HG.U133_Plus_2..CEL
      AP105
      KA
      105
      Mild
      Mild
      Mild
      Mild
      ...
      NaN
      Y
      35
      M
      01/03/2011
      19:04
      22
      Mild
      Mild
      Mild
    
    
      4
      AP11_(HG-U133_Plus_2)
      AP11_(HG-U133_Plus_2)
      AP11_.HG.U133_Plus_2..CEL
      AP11
      KA
      11
      Severe
      Severe
      NaN
      Severe
      ...
      NaN
      Y
      63
      M
      05/08/2010
      11:00
      2
      Severe
      Severe
      Severe
    
  

5 rows × 26 columns

We are using the OAC labeling...



In [6]:

    
label = label[['FileName', 'OAC']]
label[:5]









    Out[6]:






  
    
      
      FileName
      OAC
    
  
  
    
      0
      AP09_.HG.U133_Plus_2..CEL
      Severe
    
    
      1
      AP100_.HG.U133_Plus_2..CEL
      Mild
    
    
      2
      AP103_.HG.U133_Plus_2..CEL
      Severe
    
    
      3
      AP105_.HG.U133_Plus_2..CEL
      Mild
    
    
      4
      AP11_.HG.U133_Plus_2..CEL
      Severe

Join the data and labels on the FileName, remove any null rows and create a label column with 0s and 1s.



In [7]:

    
joined_tables = label.join(data, on='FileName', how = 'outer')
joined_tables = joined_tables[pd.notnull(joined_tables['Probe1'])]
joined_tables['label'] = np.where(joined_tables['OAC']=='Mild', 0, 1)
joined_tables[:5]









    Out[7]:






  
    
      
      FileName
      OAC
      Probe1
      Probe2
      Probe3
      Probe4
      Probe5
      Probe6
      Probe7
      Probe8
      ...
      Probe54667
      Probe54668
      Probe54669
      Probe54670
      Probe54671
      Probe54672
      Probe54673
      Probe54674
      Probe54675
      label
    
  
  
    
      0
      AP09_.HG.U133_Plus_2..CEL
      Severe
      7.007313
      9.665835
      12.420633
      6.794176
      3.109143
      9.374600
      6.273588
      4.729291
      ...
      13.720537
      15.015420
      14.950763
      10.959515
      9.562740
      9.949088
      2.603344
      3.270354
      4.175657
      1
    
    
      1
      AP100_.HG.U133_Plus_2..CEL
      Mild
      6.582350
      9.487955
      12.463445
      6.462573
      2.973142
      9.806086
      7.111387
      4.982339
      ...
      13.876931
      15.006758
      14.947265
      10.483055
      8.652504
      9.229872
      2.603851
      3.204993
      4.092395
      0
    
    
      2
      AP103_.HG.U133_Plus_2..CEL
      Severe
      6.802145
      9.770240
      12.469181
      7.023898
      3.288617
      9.556610
      6.531708
      4.877959
      ...
      13.933029
      15.028081
      14.989151
      10.354565
      8.798639
      9.238298
      2.710254
      3.332739
      4.404978
      1
    
    
      3
      AP105_.HG.U133_Plus_2..CEL
      Mild
      7.359145
      9.814704
      12.797041
      7.071085
      3.302783
      10.482391
      7.379395
      5.003139
      ...
      13.792453
      14.960185
      14.871316
      11.574815
      9.779834
      10.178292
      2.539879
      3.340121
      4.438165
      0
    
    
      4
      AP11_.HG.U133_Plus_2..CEL
      Severe
      5.598688
      9.552441
      11.524895
      7.224542
      3.154333
      9.310391
      6.452497
      4.946130
      ...
      13.640372
      15.044387
      14.941471
      10.538189
      9.169351
      9.571987
      2.630474
      3.326957
      4.286794
      1
    
  

5 rows × 54678 columns

Drop the FileName and OAC columns and export as a CSV file.



In [8]:

    
joined_tables.drop(['FileName','OAC'], axis=1).to_csv("/Users/Frankie/Desktop/pancreatic.csv",index=False)

	AP09_.HG.U133_Plus_2..CEL	AP100_.HG.U133_Plus_2..CEL	AP103_.HG.U133_Plus_2..CEL	AP105_.HG.U133_Plus_2..CEL	AP11_.HG.U133_Plus_2..CEL	AP117_.HG.U133_Plus_2..CEL	AP124_.HG.U133_Plus_2..CEL	AP128_.HG.U133_Plus_2..CEL	AP129_.HG.U133_Plus_2..CEL	AP135_.HG.U133_Plus_2..CEL	...	AP72_.HG.U133_Plus_2..CEL	AP77_.HG.U133_Plus_2..CEL	AP79_.HG.U133_Plus_2..CEL	AP81_.HG.U133_Plus_2..CEL	AP88_.HG.U133_Plus_2..CEL	AP91_.HG.U133_Plus_2..CEL	AP97_.HG.U133_Plus_2..CEL	AP99_.HG.U133_Plus_2..CEL	KA8_.HG.U133_Plus_2..CEL	UHA_AP_08_.HG.U133_Plus_2..CEL
Probe1	7.007313	6.582350	6.802145	7.359145	5.598688	8.608102	7.343843	6.950850	6.124941	6.240495	...	7.500518	7.386547	6.204605	5.263889	7.218517	6.656728	7.174989	8.552347	6.870060	7.835673
Probe2	9.665835	9.487955	9.770240	9.814704	9.552441	10.322249	8.812083	9.707691	9.780077	9.258532	...	9.712581	9.913510	8.688643	9.116479	8.887095	8.921837	8.935894	9.901182	9.573303	9.862596
Probe3	12.420633	12.463445	12.469181	12.797041	11.524895	12.886851	11.770089	11.792732	12.018923	12.121481	...	12.736876	12.718448	12.693598	12.149493	11.770906	12.622852	12.168525	11.998917	12.125662	12.235938
Probe4	6.794176	6.462573	7.023898	7.071085	7.224542	6.927613	6.522564	6.887145	7.086610	7.568517	...	6.764534	7.036994	6.537610	7.368730	6.338258	6.756069	7.169891	6.459975	6.905888	7.470530
Probe5	3.109143	2.973142	3.288617	3.302783	3.154333	3.459464	3.277148	3.358144	3.358739	3.083314	...	3.094350	3.520533	3.011519	3.242574	3.053249	3.085127	3.251309	3.722801	3.216627	3.037672

	z	X	FileName	FileName2	Investigator	SampleAP	AltafOAC	OAC	RAC	DBC	...	Notes	PS.Checked	age	sex	admission	time	onset	oac	rac.mild	dbc.mild
0	AP09_(HG-U133_Plus_2)	AP09_(HG-U133_Plus_2)	AP09_.HG.U133_Plus_2..CEL	AP09	KA	9	Severe	Severe	Severe	Critical	...	NaN	Y	25	M	03/08/2010	11:34	48	Severe	Severe	Severe
1	AP100_(HG-U133_Plus_2)	AP100_(HG-U133_Plus_2)	AP100_.HG.U133_Plus_2..CEL	AP100	KA	100	Mild	Mild	Mild	Mild	...	NaN	Y	54	M	18/02/2011	12:00	23	Mild	Mild	Mild
2	AP103_(HG-U133_Plus_2)	AP103_(HG-U133_Plus_2)	AP103_.HG.U133_Plus_2..CEL	AP103	KA	103	Severe	Severe	Moderate	Moderate	...	NaN	Y	38	M	26/02/2011	19:41	2	Severe	Severe	Severe
3	AP105_(HG-U133_Plus_2)	AP105_(HG-U133_Plus_2)	AP105_.HG.U133_Plus_2..CEL	AP105	KA	105	Mild	Mild	Mild	Mild	...	NaN	Y	35	M	01/03/2011	19:04	22	Mild	Mild	Mild
4	AP11_(HG-U133_Plus_2)	AP11_(HG-U133_Plus_2)	AP11_.HG.U133_Plus_2..CEL	AP11	KA	11	Severe	Severe	NaN	Severe	...	NaN	Y	63	M	05/08/2010	11:00	2	Severe	Severe	Severe